Grafana 监控实例

快速启动 Prometheus 服务

快速启动一个本地的 Prometheus 用来测试

# Local test stack: one Prometheus container and one Grafana container.
version: '3.4'
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    ports:
      # Quoted so the mapping is always read as a string.
      - '9090:9090'
    volumes:
      # Scrape configuration, kept next to this compose file.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
      # Turns on the HTTP lifecycle endpoints (e.g. POST /-/reload).
      - "--web.enable-lifecycle"
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    ports:
      - '3000:3000'
    volumes:
      - ./grafana.ini:/etc/grafana/grafana.ini:rw
      # Remember to fix the permissions of this host directory: it must be
      # writable by the user the Grafana container runs as
      # (presumably UID 472 in the official image - confirm for your tag).
      - ./data/prometheus/grafana_data:/var/lib/grafana

grafana.ini 配置文件直接使用默认的就可以了

这里介绍一下 prometheus.yml 的配置

global:
  # How often targets are scraped. When omitted, Prometheus uses 1m.
  scrape_interval: 15s
  # How often rules are evaluated. When omitted, Prometheus uses 1m.
  evaluation_interval: 15s
  # Per-scrape timeout.
  scrape_timeout: 5s
  # Labels attached when talking to external systems (federation, remote
  # storage, Alertmanager); they are not stored with the local metrics.
  # None are defined here.
  external_labels: {}

# Alertmanager configuration
# Defines the Alertmanager instances that Prometheus sends alerts to.
# Alertmanager's own configuration (receivers, notification channels and
# routing rules) is covered separately.
alerting:
  alertmanagers:
    - static_configs:
        # No Alertmanager is wired in yet; replace [] with
        # ['alertmanager:9093'] once one is running.
        - targets: []

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
# Rule files define the alerting conditions (similar to triggers).
# Prometheus evaluates the loaded rules every 'evaluation_interval'; edits to
# a rule file take effect only after the configuration is reloaded
# (SIGHUP, or POST /-/reload when --web.enable-lifecycle is set).
# Notification channels and routing are handled by Alertmanager.
rule_files: []
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# the Go test service (the job keeps the name 'prometheus').
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries
  # scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      # Address of the test service as seen from inside the Prometheus
      # container: replace the IP with your own host address. The port is
      # 2112, the port the test service listens on.
      - targets: ['10.11.40.7:2112']
        labels:
          group: 'dev'

准备一个测试服务

这里参考官方的 Go SDK 文档

# Fetch the Prometheus Go client packages in a single invocation.
go get \
  github.com/prometheus/client_golang/prometheus \
  github.com/prometheus/client_golang/prometheus/promauto \
  github.com/prometheus/client_golang/prometheus/promhttp

package main

import (
	"math/rand"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// main starts the background request generator and serves the Prometheus
// metrics endpoint at /metrics on port 2112.
func main() {
	go mockRequest()
	http.Handle("/metrics", promhttp.Handler())
	// ListenAndServe only returns on failure (for example when the port is
	// already in use), so surface the error instead of exiting silently.
	if err := http.ListenAndServe(":2112", nil); err != nil {
		panic(err)
	}
}

// mockRequest generates background traffic forever: on every iteration it
// sleeps for a random 0 or 1 seconds and then sends a GET request to the
// local server.
func mockRequest() {
	for {
		time.Sleep(time.Second * time.Duration(rand.Intn(2)))
		// Simulate a random request.
		resp, err := http.Get("http://localhost:2112/xxxxxx")
		if err != nil {
			// The server may not be listening yet; try again next round.
			continue
		}
		// Close the body so the underlying connection is released instead of
		// being leaked on every iteration.
		resp.Body.Close()
	}
}

服务暴露出端点

http://localhost:2112/metrics

监控下单失败的频率

拓展一下上面的服务，增加一个下单的业务

var (
	// submitErrorCounter counts failed order submissions. It is declared as a
	// counter vector with no label names, so it exposes a single series.
	submitErrorCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "submit_error_total",
			// Help text is shown on the /metrics page next to the metric.
			Help: "Total number of failed order submissions.",
		},
		[]string{},
	)
)

// init registers the order-failure counter through prometheus.MustRegister,
// which panics if registration fails.
func init() {
	collectors := []prometheus.Collector{submitErrorCounter}
	prometheus.MustRegister(collectors...)
}

// main starts the background request generator and serves two endpoints on
// port 2112: /metrics (Prometheus metrics) and /submit (simulated order
// submission).
func main() {
	go mockRequest()

	http.Handle("/metrics", promhttp.Handler())
	http.Handle("/submit", http.HandlerFunc(submitHandler))
	// ListenAndServe only returns on failure (for example when the port is
	// already in use), so surface the error instead of exiting silently.
	if err := http.ListenAndServe(":2112", nil); err != nil {
		panic(err)
	}
}

// submitHandler simulates an order submission. Roughly one request in ten
// (rand.Intn(10) == 0) is treated as a failure: the error counter is
// incremented and 500 is returned. Every other request gets 200.
func submitHandler(w http.ResponseWriter, r *http.Request) {
	status := http.StatusOK

	// Simulate a random order failure.
	if failed := rand.Intn(10) == 0; failed {
		submitErrorCounter.With(prometheus.Labels{}).Inc()
		status = http.StatusInternalServerError
	}

	w.WriteHeader(status)
}

如上，我们在 /submit 这个端点下模拟了一个下单的业务，监控下单失败的频率

20230303135820

配置 Alertmanager

Alertmanager 进程作用是接收来自 Prometheus 进程的报警规则消息后, 进行下一步动作(通知到人), 而 Alertmanager 并不真正关心指标(规则)的具体细节, 而只关心告警规则的 labels 用于进行路由分类将告警以合适的方式发送到应当通知到的人而不骚扰其他人。

Prometheus 的 Alertmanager 需要单独安装，这里我们使用 docker 安装

# Alertmanager plus Prometheus, both attached to the `monitor` bridge network.
version: "3"

networks:
  monitor:
    driver: bridge

services:
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    networks:
      - monitor
    ports:
      - '9093:9093'
    volumes:
      - ./data/alertmanager:/alertmanager/data:rw
      - ./alertmanager.yml:/alertmanager.yml
    command:
      - "--config.file=/alertmanager.yml"
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    networks:
      - monitor
    ports:
      - '9090:9090'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # Alert rule files; prometheus.yml must list this directory under
      # rule_files for them to be loaded.
      - ./data/rules/:/etc/prometheus/rules/:rw
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
      # Turns on the HTTP lifecycle endpoints (e.g. POST /-/reload).
      - "--web.enable-lifecycle"

接下来为 Alertmanager 准备配置文件：新建 alertmanager.yml，把它和 prometheus.yml 放在同一目录下（告警规则本身稍后在 Prometheus 的 rules 文件中定义）

global:
  # SMTP settings for email notifications. Each of them can also be set
  # individually on a receiver.
  smtp_from: example@demo.com
  smtp_smarthost: 'smtp.example.org:587'
  smtp_auth_username: example@demo.com
  # Placeholder value; do not commit a real password.
  smtp_auth_password: password
  # Whether SMTP must use TLS. Note that the default is true.
  smtp_require_tls: false

  # API callbacks on alert are omitted for now.

# Routing tree. A node's settings apply only when none of its child routes
# matches; matching is driven by the labels of the alert.
route:
  # Receiver to notify; defined under `receivers` below.
  receiver: default
  # Labels used to group alerts. Use ['...'] to group by all labels.
  group_by: ['severity']

  # How long to wait before notifying about new alerts that were added to a
  # group which has already been notified; alerts arriving within the
  # interval are batched into one email.
  group_interval: 1m
  # How long to wait before re-sending a notification that was already sent.
  repeat_interval: 20m

# Notification receivers.
receivers:
  - name: default
    # Email settings of this receiver.
    email_configs:
      - to: all@demo.com

执行 docker-compose up -d 启动成功后，在浏览器中访问 http://localhost:9093/ 即可打开 Alertmanager（默认端口为 9093）

编写 rules 文件，放在 ./data/rules/ 目录下

# alert.rules
groups:
  - name: alert.rules
    rules:
      # Fires when submit_error_total has been increasing by more than 0.2
      # per second (averaged over 1m) for 2 minutes. rate() yields errors per
      # second, not a percentage of requests.
      - alert: submit_order_err
        expr: rate(submit_error_total[1m]) > 0.2
        for: 2m
        labels:
          # Label available to Alertmanager for routing and grouping.
          severity: warning
        annotations:
          description: '提交订单接口每秒错误数大于 0.2 (current value is {{ $value }})'
          summary: Dev 提交订单错误告警

将 Alertmanager 添加到 Prometheus：前面说过，告警规则配置在 Prometheus Server 上，由 Prometheus 把告警信息发送到 Alertmanager，所以接下来我们把 Alertmanager 的地址和规则文件路径添加到 prometheus.yml 中。

# Send alerts to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Load every .yaml / .yml rule file under /etc/prometheus/rules/.
rule_files:
  - '/etc/prometheus/rules/*.yaml'
  - '/etc/prometheus/rules/*.yml'

重启 Prometheus 服务（由于启动时加了 --web.enable-lifecycle，也可以执行 curl -X POST http://localhost:9090/-/reload 重新加载配置），然后在 Prometheus 的 Web UI 中点击 Alerts，可以看到告警规则的状态

20230303150521

快速启动 Prometheus 服务

准备一个测试服务

监控下单失败的频率

配置 Alertmanager

快速启动 Prometheus 服务

准备一个测试服务

监控下单失败的频率

配置 Alertmanager